notebook.community

Edit and run



In [1]:

    
from IPython.display import FileLink
FileLink('Titanic baby step for pandas Part 1.ipynb')









    Out[1]:




Titanic baby step for pandas Part 1.ipynb



In [2]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display_html



In [3]:

    
df = pd.read_csv('data/train.csv')
df.head()









    Out[3]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
       1
       0
       3
                                 Braund, Mr. Owen Harris
         male
       22
       1
       0
              A/5 21171
        7.2500
        NaN
       S
    
    
      1
       2
       1
       1
       Cumings, Mrs. John Bradley (Florence Briggs Th...
       female
       38
       1
       0
               PC 17599
       71.2833
        C85
       C
    
    
      2
       3
       1
       3
                                  Heikkinen, Miss. Laina
       female
       26
       0
       0
       STON/O2. 3101282
        7.9250
        NaN
       S
    
    
      3
       4
       1
       1
            Futrelle, Mrs. Jacques Heath (Lily May Peel)
       female
       35
       1
       0
                 113803
       53.1000
       C123
       S
    
    
      4
       5
       0
       3
                                Allen, Mr. William Henry
         male
       35
       0
       0
                 373450
        8.0500
        NaN
       S



In [4]:

    
df.describe()









    Out[4]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
       891.000000
       891.000000
       891.000000
       714.000000
       891.000000
       891.000000
       891.000000
    
    
      mean
       446.000000
         0.383838
         2.308642
        29.699118
         0.523008
         0.381594
        32.204208
    
    
      std
       257.353842
         0.486592
         0.836071
        14.526497
         1.102743
         0.806057
        49.693429
    
    
      min
         1.000000
         0.000000
         1.000000
         0.420000
         0.000000
         0.000000
         0.000000
    
    
      25%
       223.500000
         0.000000
         2.000000
        20.125000
         0.000000
         0.000000
         7.910400
    
    
      50%
       446.000000
         0.000000
         3.000000
        28.000000
         0.000000
         0.000000
        14.454200
    
    
      75%
       668.500000
         1.000000
         3.000000
        38.000000
         1.000000
         0.000000
        31.000000
    
    
      max
       891.000000
         1.000000
         3.000000
        80.000000
         8.000000
         6.000000
       512.329200

fill the empty age with mean value



In [5]:

    
df['AgeFixed'] = df['Age'].fillna(df['Age'].mean())

Take the salutation part



In [6]:

    
df['Salutation'] = df['Name'].apply(lambda x: x.split(', ')[1].split('.')[0])
df.head()









    Out[6]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      AgeFixed
      Salutation
    
  
  
    
      0
       1
       0
       3
                                 Braund, Mr. Owen Harris
         male
       22
       1
       0
              A/5 21171
        7.2500
        NaN
       S
       22
         Mr
    
    
      1
       2
       1
       1
       Cumings, Mrs. John Bradley (Florence Briggs Th...
       female
       38
       1
       0
               PC 17599
       71.2833
        C85
       C
       38
        Mrs
    
    
      2
       3
       1
       3
                                  Heikkinen, Miss. Laina
       female
       26
       0
       0
       STON/O2. 3101282
        7.9250
        NaN
       S
       26
       Miss
    
    
      3
       4
       1
       1
            Futrelle, Mrs. Jacques Heath (Lily May Peel)
       female
       35
       1
       0
                 113803
       53.1000
       C123
       S
       35
        Mrs
    
    
      4
       5
       0
       3
                                Allen, Mr. William Henry
         male
       35
       0
       0
                 373450
        8.0500
        NaN
       S
       35
         Mr

here is how salutation looks like



In [7]:

    
df.groupby('Salutation')['PassengerId'].count()









    Out[7]:





Salutation
Capt              1
Col               2
Don               1
Dr                7
Jonkheer          1
Lady              1
Major             2
Master           40
Miss            182
Mlle              2
Mme               1
Mr              517
Mrs             125
Ms                1
Rev               6
Sir               1
the Countess      1
Name: PassengerId, dtype: int64

remove the Capt / Don part



In [8]:

    
s = df.groupby('Salutation')['PassengerId'].count()
s.sort(ascending=False)
valid_vals = s[:4].index
df['SalutationNew'] = df['Salutation'].apply(lambda x: x in valid_vals and x or 'Others')
df.boxplot('Age', by='SalutationNew', figsize=(20, 4))









    Out[8]:





<matplotlib.axes.AxesSubplot at 0x7f07e481db10>

SalutationNew 和 Age 关系比较显著



In [9]:

    
df.boxplot('Age', by=['Sex', 'Pclass'], figsize=(20, 4))









    Out[9]:





<matplotlib.axes.AxesSubplot at 0x7f07e47c99d0>

Sex/Pclass 和 Age 的关系也比较显著所以可以用 Sex/Pclass/SalutationNew 这三个变量矫正缺失值



In [10]:

    
# 翻转矩阵 算出 Sex/Pclass 下每个 SalutationNew 的 Age 中位数值 用来填充缺失 Age
t = df.pivot_table(values='Age', index='SalutationNew', columns=['Sex', 'Pclass'], aggfunc=np.median)
display_html(t)
df['AgeFixed'] = df['Age']
df['AgeFixed'].fillna(df[df['Age'].isnull()].apply(lambda x: t[x['Sex']][x['Pclass']][x['SalutationNew']], axis=1), inplace=True)
df.describe()









    






  
    
      Sex
      female
      male
    
    
      Pclass
      1
      2
      3
      1
      2
      3
    
    
      SalutationNew
      
      
      
      
      
      
    
  
  
    
      Master
        NaN
      NaN
      NaN
        4
        1.0
        4
    
    
      Miss
       30.0
       24
       18
      NaN
        NaN
      NaN
    
    
      Mr
        NaN
      NaN
      NaN
       40
       31.0
       26
    
    
      Mrs
       41.5
       32
       31
      NaN
        NaN
      NaN
    
    
      Others
       28.5
       28
      NaN
       49
       46.5
      NaN
    
  








    Out[10]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
      AgeFixed
    
  
  
    
      count
       891.000000
       891.000000
       891.000000
       714.000000
       891.000000
       891.000000
       891.000000
       891.000000
    
    
      mean
       446.000000
         0.383838
         2.308642
        29.699118
         0.523008
         0.381594
        32.204208
        29.149461
    
    
      std
       257.353842
         0.486592
         0.836071
        14.526497
         1.102743
         0.806057
        49.693429
        13.503296
    
    
      min
         1.000000
         0.000000
         1.000000
         0.420000
         0.000000
         0.000000
         0.000000
         0.420000
    
    
      25%
       223.500000
         0.000000
         2.000000
        20.125000
         0.000000
         0.000000
         7.910400
        21.000000
    
    
      50%
       446.000000
         0.000000
         3.000000
        28.000000
         0.000000
         0.000000
        14.454200
        26.000000
    
    
      75%
       668.500000
         1.000000
         3.000000
        38.000000
         1.000000
         0.000000
        31.000000
        36.750000
    
    
      max
       891.000000
         1.000000
         3.000000
        80.000000
         8.000000
         6.000000
       512.329200
        80.000000

找出异常的 Fare 数据并用正常临界值代替掉



In [11]:

    
fig = plt.pyplot.figure(figsize=(30, 4))
ax = fig.add_subplot(121)
ax.set_title('before fix')

s = df['Fare'].copy()
s.hist()
n = 4
outhigh_fare = s[s - s.mean() >= n * s.std()]
outlow_fare = s[s.mean() - s >= n * s.std()]
if outhigh_fare.count():
    s[s >= outhigh_fare.max()] = s[s < outhigh_fare.min()].max()
if outlow_fare.count():
    s[s <= outhigh_fare.min()] = s[s > outhigh_fare.max()].min()

ax = fig.add_subplot(122)
ax.set_title('after fix')
s.hist()

# ax = fig.add_subplot(133)
# ax.set_title('after fix')
df['FareFixed'] = s
df.boxplot(['Fare', 'FareFixed'], by='Pclass', figsize=(30, 4))









    Out[11]:





array([<matplotlib.axes.AxesSubplot object at 0x7f07e4341c90>,
       <matplotlib.axes.AxesSubplot object at 0x7f07e430de90>], dtype=object)



In [11]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

Sex	female			male
Pclass	1	2	3	1	2	3
SalutationNew
Master	NaN	NaN	NaN	4	1.0	4
Miss	30.0	24	18	NaN	NaN	NaN
Mr	NaN	NaN	NaN	40	31.0	26
Mrs	41.5	32	31	NaN	NaN	NaN
Others	28.5	28	NaN	49	46.5	NaN